#import the necessary packages
import pandas as pd
import numpy as np
import numpy.linalg as ln
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
from bokeh.io import show, output_notebook, push_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KDTree
import warnings
# Suppress every warning raised from here on.
# NOTE(review): this blanket filter also hides pandas / scikit-learn
# deprecation and chained-assignment warnings — consider narrowing it.
warnings.filterwarnings('ignore')
# Show at most 10 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 10)
# Uncomment to remove the row limit when displaying DataFrames:
#pd.set_option('display.max_rows', None)
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import SpectralClustering
from sklearn.cluster import Birch
from sklearn.cluster import KMeans
# import the necessary library for Kendall correlation coefficient
from scipy.stats import kendalltau
# import the necessary library for Spearman correlation coefficient
from scipy.stats import spearmanr
# import the necessary library for Pearson correlation coefficient
from scipy.stats import pearsonr
def _load_tsv(path):
    """Read a tab-separated file and discard its 'Unnamed: 0' column.

    NOTE(review): 'Unnamed: 0' is presumably a stale index column written by
    an earlier export — confirm against the files.
    """
    return pd.read_csv(path, sep='\t').drop(columns=['Unnamed: 0'])


# Data that is fed to the model
df = _load_tsv("df_final17_04_2020.csv")
# Data holding the employees' personal information
personal_info1 = _load_tsv("personal_info.csv")
personal_info1.tail(2)
# Rescale each employee's experience by 1/12 and keep two decimals
# (presumably months -> years; confirm the unit of 'date_exp').
experience = df['date_exp'] / 12
df['date_exp'] = experience.round(2)
df.head(5)
# Concatenate all the important data: the model data plus the employee name
# taken from the personal-information table (aligned on the row index).
employee_names = personal_info1['name']
df_final = pd.concat([df, employee_names], axis=1)
df_final.head(30)
# Read the Excel sheet of skill weights: each row pairs a job profile
# ('profile') with a technology ('technologies') and the coefficient ('coef')
# to apply to that technology for employees with that profile.
exel_ponderation = pd.read_excel("wevioo rh skills with coefs.xlsx")
exel_ponderation

# Weight the skill scores dynamically according to each employee's job profile.
# NB: both the professional column (`<technology>`) and the academic column
# (`<technology>_academic`) are scaled by the same coefficient.
#
# The update is applied per column through a boolean mask instead of the
# cell-by-cell chained indexing `df_final[col][j] = ...`: chained assignment
# may write to a temporary copy and is dropped under pandas Copy-on-Write.
job_profiles = df_final['job2'].str.lower()  # lower-cased once, not per cell
for profile, technology, coef in zip(exel_ponderation['profile'],
                                     exel_ponderation['technologies'],
                                     exel_ponderation['coef']):
    matches = job_profiles == profile.lower()
    if not matches.any():
        # No employee has this profile: leave the frame untouched and do not
        # require the technology columns to exist.
        continue
    skill_col = technology.lower()
    for col in (skill_col, skill_col + '_academic'):
        # Only the matching rows are scaled; assigning the whole column lets
        # pandas upcast integer scores when the coefficient is fractional.
        df_final[col] = df_final[col].mask(matches, df_final[col] * coef)
df_final.head(50)
# Every column of df_final except the last three.
# NOTE(review): Q is not used in the visible part of the file.
Q = df_final.iloc[:, :-3]

## t-SNE is a dimension-reduction machine-learning algorithm, used here to
## visualise the profile distribution. Its input is every column of df_final
## except the last two (presumably the non-numeric columns — confirm).
tsne_input = df_final.iloc[:, :-2]
A = tsne_input.to_numpy()
A
# Same matrix as nested Python lists.
A_list = A.tolist()
import inspect

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter`, and later
# releases no longer accept the old name. Use whichever keyword the installed
# version supports instead of hard-coding `n_iter`.
_tsne_iter_kw = ('max_iter'
                 if 'max_iter' in inspect.signature(TSNE).parameters
                 else 'n_iter')
# Hyper-parameters shared by the 2-D and 3-D embeddings; the fixed
# random_state keeps the layouts reproducible between runs.
_tsne_settings = {'learning_rate': 200, 'random_state': 42, _tsne_iter_kw: 300}

# Dimension reduction with 2 t-SNE components
model = TSNE(n_components=2, **_tsne_settings)
tsne_features = model.fit_transform(A)
# Make X, Y columns
df_final['X'] = tsne_features[:, 0]
df_final['Y'] = tsne_features[:, 1]

# Dimension reduction with 3 t-SNE components
model3D = TSNE(n_components=3, **_tsne_settings)
tsne_features3D = model3D.fit_transform(A)
# Make X, Y, Z columns
df_final['X3D'] = tsne_features3D[:, 0]
df_final['Y3D'] = tsne_features3D[:, 1]
df_final['Z3D'] = tsne_features3D[:, 2]
# Bokeh is the first tool used to visualise the 2-D t-SNE distribution;
# output is rendered inline in the notebook.
output_notebook()

# Wrap df_final so glyphs and tooltips can refer to its columns by name.
source = ColumnDataSource(df_final)

# Scatter plot of the two t-SNE coordinates.
plot = figure(
    x_axis_label="T-SNE 1",
    y_axis_label="T-SNE 2",
    width=500,
    height=400,
)
marker_style = dict(size=4, color='#084594', alpha=.8)
plot.circle(x='X', y='Y', source=source, **marker_style)

# Hover tool showing each point's information (name + job).
tooltip_fields = [('name', '@name'), ('job2', '@job2')]
hover = HoverTool(tooltips=tooltip_fields)
plot.add_tools(hover)

# Display the first visualisation.
show(plot)